setup R-environment and load/prepare data

knitr::opts_chunk$set(echo = TRUE)

if (knitr::is_latex_output()) {
  MODE <- "PDF"
} else if (knitr::is_html_output()) {
  MODE <- "HTML"
} else {
  MODE <- "OTHER"
}

print(MODE)
## [1] "HTML"

load all packages required

library(cowplot)
library(ggforce)
library(ggpubr)
library(ggrepel)
library(knitr)
library(paletteer)
library(ggalt)
library(plotly)
library(ggbeeswarm)
library(scico)
library(cividis)
library(ggrastr)
library(tools)
library(mgcv)  # For fitting GAM model
library(h2o)
library(Metrics)
library(DescTools)
library(tidyverse)

# library(PCAtools)
# library(gridExtra)
# library(forcats)
library(ggrastr)
# library(DescTools)


theme_set(theme_cowplot(12))

############################################################################################### ############################################################################################### ############################################################################################### # ########################### gypsy27 sensor ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # gypsy27 sensor

load data

# load data
RAWhist = read_tsv("gypsy27.bg", col_names = TRUE)%>%
  {}

RAWstats = read_tsv("ALL.stats.gypsy27.txt", col_names = TRUE)%>%
  {}

RAW = left_join(RAWhist, RAWstats, by = c("libNAME"))%>%
  {}


UTRstart = RAW %>%
  summarise(
    UTRstart = mean(UTRstart)
  )%>%
  pull(UTRstart)%>%
  {}

UTRend = RAW %>%
  summarise(
    UTRend = max(UTRend)
  )%>%
  pull(UTRend)%>%
  {}

plot sRNA histogram

NORM="COUNT_QPCRnorm"

PLOTtable <- RAW %>%
  filter(!str_detect(libNAME, "siYB"),
         !str_detect(libNAME, "ExoSap")) %>%
  filter(str_detect(libNAME, "clone")) %>%
  group_by(SENSOR, POSITION) %>%
  summarise(
    minVAL    = min(.data[[NORM]], na.rm = TRUE),
    maxVAL    = max(.data[[NORM]], na.rm = TRUE),
    mean_value = mean(.data[[NORM]], na.rm = TRUE),
    sd        = sd(.data[[NORM]], na.rm = TRUE),
    se        = sd / sqrt(n()),
    ci_upper  = mean_value + 1.96 * se,
    ci_lower  = mean_value - 1.96 * se,
    .groups   = "drop"
  )

p= PLOTtable %>%
    ggplot( aes(x = POSITION, y = mean_value, color = SENSOR,fill=SENSOR)) +
    geom_line() +
  geom_ribbon(aes(ymin = minVAL, ymax = maxVAL), alpha = 0.2, color = NA) +
    labs(x = "Position on sensor",
         y = "QPCR normalized sRNA count")+
  geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRstart+1125, linetype = "dashed", color = "red", size=0.5) +
  geom_vline(xintercept=UTRstart+3141, linetype = "dashed", color = "red", size=0.5) +
  scale_color_manual(values = c("#0072B2", "#E69F00")) +
  scale_fill_manual(values = c("#0072B2", "#E69F00")) +
  scale_y_continuous(expand = expansion(mult = c(0.01, NA)))+
  theme_cowplot(14) +
  theme(
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black"),
    axis.line.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.length.x = unit(0, "mm"), # Remove tick length
        plot.margin = margin(5.5, 5.5, 1.5, 5.5, "pt")
  ) +
{}
  
print(p)

ggsave("gypsy27_histogram.pdf", p, width = 8, height = 2.5)

plot fold change along the sensor

PLOTtable2 = PLOTtable %>% 
  select(POSITION,mean_value, SENSOR)%>%
  pivot_wider(names_from = SENSOR, values_from = mean_value)%>%
  mutate(
    foldchange = `PLH202-DL_Gypsy27U`/(`PLH203-DL_Gypsy27A`)
  )%>%
  filter(POSITION>=UTRstart-50, POSITION<=UTRend+50)%>%
  {}

p= PLOTtable2 %>%
    ggplot( aes(x = POSITION, y = foldchange)) +
    geom_smooth(method="loess",span=0.01, se = FALSE,color="black" )+
  scale_y_log10()+
  scale_x_continuous(limits = c(0, NA))+
  geom_hline(yintercept = 1, color = "black", size=0.5) +
    geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRstart+1125, linetype = "dashed", color = "red", size=0.5) +
  geom_vline(xintercept=UTRstart+3141, linetype = "dashed", color = "red", size=0.5) 



print (p)

  ggsave("gypsy27_histogram.foldchange.pdf", p, width = 8, height = 1.5)

plot nucleotide-content

PLOTtable = RAW %>%
  filter(!str_detect(libNAME,"siYB"), !str_detect(libNAME,"ExoSap"))%>%
  filter(str_detect(libNAME,"clone") )%>%
  group_by(SENSOR,POSITION) %>%
  summarise(
    minVAL= min(NUC_T),  
    maxVAL= max(NUC_T),
    mean_value = mean(NUC_T),
    sd = sd(NUC_T),
    se = sd / sqrt(n()),  # Standard error
    ci_upper = mean_value + 1.96 * se,  # 95% confidence interval
    ci_lower = mean_value - 1.96 * se
  )
## `summarise()` has grouped output by 'SENSOR'. You can override using the
## `.groups` argument.
p= PLOTtable %>%
    arrange(SENSOR != "PLH203-DL_Gypsy27A")%>%
    ggplot( aes(x = POSITION, y = mean_value, color = SENSOR,fill=SENSOR)) +
    geom_smooth(method="loess", span=0.01, se = FALSE, size=0.8) +
      labs(x = "Position on sensor",
         y = "QPCR normalized sRNA count")+
  geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRstart+1125, linetype = "dashed", color = "red", size=0.5) +
  geom_vline(xintercept=UTRstart+3141, linetype = "dashed", color = "red", size=0.5) +
  scale_color_manual(values = c("black", "#E69F00")) +
  scale_fill_manual(values = c("black", "#E69F00")) +
  coord_cartesian(ylim=c(10,60)) +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  ) +
{}
  
print(p)
## `geom_smooth()` using formula = 'y ~ x'

ggsave("gypsy27_histogram.nucCONT.pdf", p, width = 8, height = 2)
## `geom_smooth()` using formula = 'y ~ x'

############################################################################################### ############################################################################################### ############################################################################################### # ########################### BoxB tethering sensor ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # BoxB tethering sensor

load data

# load data
RAWhist = read_tsv("BoxB.bg", col_names = TRUE)%>%
  {}

RAWstats = read_tsv("ALL.stats.BoxB.txt", col_names = TRUE)%>%
  {}

RAW = left_join(RAWhist, RAWstats, by = c("libNAME"))%>%
  {}


UTRstart = RAW %>%
  summarise(
    UTRstart = mean(UTRstart)
  )%>%
  pull(UTRstart)%>%
  {}

UTRend = RAW %>%
  summarise(
    UTRend = max(UTRend)
  )%>%
  pull(UTRend)%>%
  {}

plot sRNA histogram

plotTABLE = RAW %>%
  filter(!str_detect(libNAME,"siYB"), !str_detect(libNAME,"ExoSap"))%>%
  separate(libNAME, c("SENSORid","CLONE","EXPtype"), sep = "_",remove=FALSE)%>%
  separate(SENSOR, c("PLASMID","nBoxB"), sep = "_",remove=FALSE)%>%
  group_by(EXPtype,POSITION,nBoxB) %>%
  summarise(
    minVAL= min(COUNT_piRNAnorm),  
    maxVAL= max(COUNT_piRNAnorm),
    mean_value = mean(COUNT_piRNAnorm),
    sd = sd(COUNT_piRNAnorm),
    se = sd / sqrt(n()),  # Standard error
    ci_upper = mean_value + 1.96 * se,  # 95% confidence interval
    ci_lower = mean_value - 1.96 * se
  )

n_sensors= plotTABLE %>%
  select(EXPtype)%>%
  unique()%>%
  nrow()
n_sensors = n_sensors - 1 
# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
sensor_colors <- c("black",okabe_ito[1:n_sensors])

p= plotTABLE %>%
  filter(nBoxB == "2xBoxB")%>%
    ggplot( aes(x = POSITION, y = mean_value, color = EXPtype ,fill=EXPtype)) +
    geom_line() +
  geom_ribbon(aes(ymin = minVAL, ymax = maxVAL), alpha = 0.2, color = NA) +
    labs(x = "Position on sensor",
         y = "QPCR normalized sRNA count")+
  geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  scale_color_manual(values = sensor_colors) +
  scale_fill_manual(values = sensor_colors) +
  facet_col(~nBoxB)+
  coord_cartesian(ylim=c(0,12.5)) +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "inside",
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  ) +
  {}
print(p)

ggsave("BoxB_histogram.2x.pdf", p, width =6.5, height = 5)

p= plotTABLE %>%
  filter(nBoxB == "4xBoxB")%>%
    ggplot( aes(x = POSITION, y = mean_value, color = EXPtype ,fill=EXPtype)) +
    geom_line() +
  geom_ribbon(aes(ymin = minVAL, ymax = maxVAL), alpha = 0.2, color = NA) +
    labs(x = "Position on sensor",
         y = "QPCR normalized sRNA count")+
  geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  scale_color_manual(values = sensor_colors) +
  scale_fill_manual(values = sensor_colors) +
  facet_col(~nBoxB)+
  coord_cartesian(ylim=c(0,12.5)) +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "inside",
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  ) +
  {}
print(p)

ggsave("BoxB_histogram.4x.pdf", p, width =6.5, height = 5)

## fold change along the sensor

PLOTtable2 = plotTABLE %>% 
  ungroup()%>%
  mutate(
    ID = paste(EXPtype,nBoxB,sep="_")
  )%>%
  select(POSITION,mean_value, ID)%>%
 pivot_wider(names_from = ID, values_from = mean_value)%>%
  mutate(
    foldchange = `LN-YB_4xBoxB`/`onlyYB_4xBoxB`
  )%>%
  filter(POSITION>=UTRstart-50, POSITION<=UTRend+50)%>%
  {}

p= PLOTtable2 %>%
    ggplot( aes(x = POSITION, y = foldchange)) +
    geom_smooth(method="loess",span=0.01, se = FALSE,color="black" )+
  scale_y_log10()+
  scale_x_continuous(limits = c(0, NA))+
  geom_hline(yintercept = 1, color = "black", size=0.5) +
    geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRstart+1100, linetype = "dashed", color = "red", size=0.5) 



print (p)

  ggsave("BoxB_histogram.foldchange.pdf", p, width = 8, height = 1.5)

############################################################################################### ############################################################################################### ############################################################################################### # ########################### tj-CIS in U20 sensor ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # tj-CIS in U20 sensor

load data

# load data
RAWhist = read_tsv("tjCIS_U20.bg", col_names = TRUE)%>%
  {}

RAWstats = read_tsv("ALL.stats.tjCIS_U20.txt", col_names = TRUE)%>%
  {}

RAW_tjCIS = left_join(RAWhist, RAWstats, by = c("libNAME"))%>%
  {}

RAW_U20 = read_tsv("Uramp-sensor.bg", col_names = TRUE)%>%
  filter(str_detect(libNAME,"U20"))%>%
  {}

RAWstats_U20 = read_tsv("ALL.stats.Uramp.txt", col_names = TRUE)%>%
  filter(str_detect(libNAME,"U20"))%>%
  {}

RAW_U20 = left_join(RAW_U20, RAWstats_U20, by = c("libNAME"))%>%
  {}

#concatenate RAW and RAW_U20
RAW = RAW_tjCIS %>%
  bind_rows(RAW_U20)%>%
  {}

UTRstart = RAW %>%
  summarise(
    UTRstart = mean(UTRstart)
  )%>%
  pull(UTRstart)%>%
  {}

UTRend = RAW %>%
  summarise(
    UTRend = max(UTRend)
  )%>%
  pull(UTRend)%>%
  {}

plot sRNA histogram

VERSION="COUNT_miRNAnorm"

plotTABLE = RAW %>%
  # filter(!str_detect(libNAME,"siYB"))%>%
  separate(libNAME, c("SENSORid","CLONE","EXPtype"), sep = "_",remove=FALSE)%>%
  mutate(
    CONDITION=case_when(
      str_detect(libNAME,"siYB") ~ "siYB",
      TRUE ~ "siLuc"
    ),
    CIS = case_when(
      str_detect(libNAME,"shuffle") ~ "SHUFFLE",
      str_detect(SENSOR,"Uramp") ~ "U20",
      str_detect(SENSOR,"CIS-wt") ~ "tj-CIS",
      TRUE ~ "NA"
      # TRUE ~ libNAME
    )
  )%>%
  group_by(POSITION,CONDITION,CIS) %>%
  summarise(
    minVAL= min(.data[[VERSION]]),  
    maxVAL= max(.data[[VERSION]]),
    mean_value = mean(.data[[VERSION]]),
    sd = sd(.data[[VERSION]]),
    se = sd / sqrt(n()),  # Standard error
    ci_upper = mean_value + 1.96 * se,  # 95% confidence interval
    ci_lower = mean_value - 1.96 * se
  )%>%
  filter(CIS=="SHUFFLE" | CIS=="U20" | str_detect(CIS,"tj-CIS"))%>%
      mutate( 
      POSITION = case_when(
        CIS == "U20" & POSITION>UTRstart+984 ~ POSITION+100,
        TRUE ~ POSITION
      ),
      mean_value= case_when(
        CIS != "SHUFFLE" & POSITION>UTRstart+984 & POSITION<UTRstart+984+100 ~ 0,
        TRUE ~ mean_value
      ),
      maxVAL= case_when(
        CIS != "SHUFFLE" & POSITION>UTRstart+984 & POSITION<UTRstart+984+100 ~ 0,
        TRUE ~ maxVAL
      ),
      minVAL= case_when(
        CIS != "SHUFFLE" & POSITION>UTRstart+984 & POSITION<UTRstart+984+100 ~ 0,
        TRUE ~ minVAL
      )

      )%>%
  {}

n_sensors= plotTABLE %>%
  select(CONDITION)%>%
  unique()%>%
  nrow()
n_sensors = n_sensors - 1 
# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
sensor_colors <- c("black",okabe_ito[1:n_sensors])

p= plotTABLE %>%
    ggplot( aes(x = POSITION, y = mean_value, color = CIS ,fill=CIS)) +
    geom_line() +
  geom_ribbon(aes(ymin = minVAL, ymax = maxVAL), alpha = 0.2, color = NA) +
    labs(x = "Position on sensor",
         y = "QPCR normalized sRNA count")+
  geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  annotate('rect', xmin=UTRstart+984, xmax=UTRstart+1084, ymin=- Inf, ymax=Inf, alpha=.2, fill='#cc79a7')+
  scale_color_manual(values = sensor_colors) +
  scale_fill_manual(values = sensor_colors) +
  facet_grid(CIS~CONDITION)+
  # scale_y_continuous(limits = c(0, 60)) +
  # coord_cartesian(ylim=c(0,12.5)) +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    #no legend
    legend.position="none",
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  ) +
  {}
print(p)

ggsave("tjCIS_in_U20.pdf", p, width =6.5, height = 5)

plot nucleotide-content

PLOTtable = RAW %>%
  filter(!str_detect(libNAME,"siYB"), !str_detect(libNAME,"ExoSap"))%>%
  filter(str_detect(libNAME,"clone") )%>%
  group_by(SENSOR,POSITION) %>%
  summarise(
    minVAL= min(NUC_T),  
    maxVAL= max(NUC_T),
    mean_value = mean(NUC_T),
    sd = sd(NUC_T),
    se = sd / sqrt(n()),  # Standard error
    ci_upper = mean_value + 1.96 * se,  # 95% confidence interval
    ci_lower = mean_value - 1.96 * se
  )


p= PLOTtable %>%
    arrange(SENSOR != "PLH203-DL_Gypsy27A")%>%
    ggplot( aes(x = POSITION, y = mean_value, color = SENSOR,fill=SENSOR)) +
    geom_smooth(method="loess", span=0.01, se = FALSE, size=0.8) +
      labs(x = "Position on sensor",
         y = "QPCR normalized sRNA count")+
  geom_vline(xintercept=UTRstart, linetype = "dashed", color = "grey", size=0.5) +
  geom_vline(xintercept=UTRend, linetype = "dashed", color = "grey", size=0.5) +
  annotate('rect', xmin=UTRstart+984, xmax=UTRstart+1084, ymin=- Inf, ymax=Inf, alpha=.2, fill='#cc79a7')+
  scale_color_manual(values = c("black", "#E69F00")) +
  scale_fill_manual(values = c("black", "#E69F00")) +
  coord_cartesian(ylim=c(10,60)) +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  ) +
{}
  
print(p)

ggsave("tjCIS_in_U20_histogram.nucCONT.pdf", p, width = 8, height = 2)

###############################################################################################

############################################################################################### ############################################################################################### ############################################################################################### # #################### U-UU simulation ####################################### ############################################################################################### ############################################################################################### ###############################################################################################

RAW = read_tsv("U_vs_UU.simulation.txt")%>%
  {}

RAW %>% 
  filter(! grepl("total", `#seq` ))%>%
  ggplot(aes(x=T, y=TT))+
  geom_point()+
  geom_smooth()

############################################################################################### ############################################################################################### ############################################################################################### # #################### parameter sweep analysis biogenesis-efficiency ####################################### ############################################################################################### ############################################################################################### ###############################################################################################

plot the 100nt tile-size for the figure - Nucleotide bias

# load data
RAW = read_tsv("biogEfficiency.RNAnorm.siWT.ds500.shared.final_for-figures_2.corr.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))
## New names:
## Rows: 378 Columns: 273
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (6): TYPE, ID...17, ID...66, ID...259, ID...266, ID...273 dbl (266): TILEsize,
## TILEshift, nucTILEsize, pearson_NUC_A, pearson_NUC_C, p... lgl (1):
## pearson_RNAnorm_nucREGION_CLIP_CLIP_YB.all
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `ID...15` -> `ID...17`
## • `ID...64` -> `ID...66`
## • `ID...257` -> `ID...259`
## • `ID...264` -> `ID...266`
## • `ID...271` -> `ID...273`
  # load data
for (currTYPE in c("UTR","CDS","CLUSTER")){
    
    currTABLE = RAW %>%
      filter(TYPE == currTYPE)
      {}
    
    STATs= currTABLE %>% 
        select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"))%>%
        pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
        separate(name, c("STATtype","RNAnorm","nucTILE","CLIP","CLIP2","CLIP_PROT"), sep = "_", remove = FALSE)%>%
        select(STATtype)%>%
        unique()%>%
        unlist()
    
    TILEsizes= currTABLE %>%
      select(TILEsize)%>%
      unique()
    
    SELECTsize=650
    SELECTshift=-300
    
    STATs="kendall"
    for (METHOD in STATs){
        MIN=currTABLE%>%
          select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
          pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
          filter(str_detect(name,METHOD))%>%
          select(value)%>%min()*1.1
        MAX=currTABLE%>%
          select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
          pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
          filter(str_detect(name,METHOD))%>%
          select(value)%>%max()*1.1
  
      X=currTABLE %>%
        select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),TYPE)%>%
        # select(-TYPE, -contains("ID"))%>%
        pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE"),TYPE))%>%
        filter(TILEsize==100)%>%
        separate(name, c("STATtype","RNAnorm","nucTILE","CLIP","CLIP2","CLIP_PROT"), sep = "_", remove = FALSE)%>%
        # mutate(name = case_when(
        #   str_detect(name, "NUC_T") ~ "NUC_U" ,
        #   TRUE ~ paste(NUC,NUCid,sep="_")
        # ))%>%
        filter( STATtype == METHOD)
    
  
      maxLINE= X %>% 
        group_by(TILEsize,TYPE)%>%
        filter(TYPE==currTYPE)%>%
        filter(value==max(value))%>%
        #round number
        mutate(value=round(value,4))
    
      selectLINE = X %>%
        filter(nucTILEsize==SELECTsize & TILEshift==SELECTshift)%>%
        mutate(value=round(value,4))
      
      p=X %>%
        ggplot(aes(y=nucTILEsize, x=TILEshift))+
        # geom_contour_filled(aes(y=nucTILEsize, x=TILEshift, z=value), binwidth=0.025 )+
        geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
        # geom_point_rast(data=maxLINE, aes(y=nucTILEsize, x=TILEshift),color="red")+
        geom_text(data = maxLINE, 
              aes(y = nucTILEsize, x = TILEshift, label = value), 
              color = "black", 
              size = 2, 
              vjust = 1.5) +  # Adjust position vertically
        geom_text(data=selectLINE,
              aes(y = nucTILEsize, x = TILEshift, label = round(value,3)), 
              color = "black", 
              size = 0.5, 
              vjust = 1.5) +  # Adjust position vertically
        # facet_grid(TILEsize~name) +
        # facet_wrap(~TYPE)+
        # geom_point_rast(aes(x=SELECTshift,y=SELECTsize), color="green")+
        geom_vline(xintercept = 00, linetype =  3)+
        # geom_vline(xintercept = -420, linetype = 2)+
        geom_hline(yintercept = 100, linetype = 3)+
        labs(title=paste("CLIP.",currTYPE,sep=""), 
             y="window used to evaluate CLIP signal [nt]",
             x="start position shift relative to the piRNA tile [nt]",
        )+
        theme_cowplot(12)+
        theme(
            # legend.position = "none",
            axis.text = element_text(size=12),
            axis.text.x = element_text(angle = 90, hjust = 1),
            axis.title = element_text(size=10),
            strip.text = element_text(size=10),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_blank(),
            aspect.ratio = 1
      
        )+
        # scale_fill_scico(limits=c(MIN,MAX), midpoint=0, palette = "vik", direction = 1 )
        scale_fill_scico( palette = "vik", direction = 1 )
    
      print(p)
       
       ggsave(paste("parameter-sweep.BiogEFF_vs_YB-CLIP",currTYPE,".pdf",sep=""), p, height = 100, width = 100, units = "mm", dpi = 300)
    }
}

## Warning in min(structure(logical(0), dim = 0:1, dimnames = list(NULL,
## "value")), : no non-missing arguments to min; returning Inf
## Warning in max(structure(logical(0), dim = 0:1, dimnames = list(NULL,
## "value")), : no non-missing arguments to max; returning -Inf
## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `value == max(value)`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf

## Warning in min(structure(logical(0), dim = 0:1, dimnames = list(NULL,
## "value")), : no non-missing arguments to min; returning Inf
## Warning in max(structure(logical(0), dim = 0:1, dimnames = list(NULL,
## "value")), : no non-missing arguments to max; returning -Inf
## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `value == max(value)`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf

plot the biogenesisEFF heatmap - Nucleotide bias

#extract available statistical methods
STATs= RAW %>% 
    select(contains("TILE"), contains("_NUC_"))%>%
    pivot_longer(-contains("TILE"))%>%
    separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
    select(STATtype)%>%
    unique()%>%
    unlist()

#fix certain parameters
currSIZE=100
METHOD="kendall"
BINWIDTH=0.025

#functions to round up or down to the nearest multiple
round_up_to <- function(x, multiple) {
  ceiling(x / multiple) * multiple
}
round_down_to <- function(x, multiple) {
  floor(x / multiple) * multiple
}

for (currTYPE in c("UTR")){
  
  currTABLE = RAW %>%
    filter(TYPE == currTYPE)
    {}

  #determine the limits for the current filter
  LIMITS=currTABLE %>%
    select(contains("TILE"), contains("_NUC_"))%>%
    pivot_longer(-contains("TILE"))%>%
    separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
    filter(STATtype == METHOD & TILEsize==currSIZE )%>%
    summarise(minVAL=min(value), maxVAL=max(value))
  #round to the nearest binwidth
  maxVAL=round_up_to(LIMITS$maxVAL,BINWIDTH)
  minVAL=round_down_to(LIMITS$minVAL,BINWIDTH)
  BREAKS=seq(minVAL,maxVAL,by=BINWIDTH)
  
  #loop through all the nucleotides to get individual plots
  for (currNUC in c("A","C","G","T")){
    #extract the relevant data for the current plot and change T to U
    X=currTABLE %>%
      # filter(TYPE == currTYPE )%>%
      select(contains("TILE"), contains("_NUC_"),TYPE)%>%
      pivot_longer(-c(contains("TILE"),TYPE))%>%
      separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
      mutate(name = case_when(
        str_detect(name, "NUC_T") ~ "NUC_U" ,
        TRUE ~ paste(NUC,NUCid,sep="_")
      ))%>%
      filter(TILEsize==currSIZE & NUCid==currNUC & STATtype == METHOD)
    
      MIN=currTABLE%>%
          select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
          pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
          filter(str_detect(name,METHOD))%>%
          select(value)%>%min()*1.1
      MAX=currTABLE%>%
          select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
          pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
          filter(str_detect(name,METHOD))%>%
          select(value)%>%max()*1.1
  
    
    #determine the maximum correlation
    maxLINE= X %>%
      group_by(TILEsize)%>%
      filter(value==max(value))%>%
      #round number
      mutate(value=round(value,2))
    
    #select the current nucleotide for the plot
    X = X %>% 
      mutate(
        case_when(
          nucTILEsize==min(nucTILEsize) & TILEshift==min(TILEshift) ~ minVAL,
          nucTILEsize==max(nucTILEsize) & TILEshift==max(TILEshift) ~ maxVAL,
          TRUE ~ value  
        )
      )
    
    #plot the graph
    p=X %>%
      ggplot()+
      # geom_contour_filled(aes(y=nucTILEsize, x=TILEshift, z=value) )+
      geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
      geom_point_rast(data=maxLINE, aes(y=nucTILEsize, x=TILEshift),color="red")+
      geom_point_rast(aes(y=750, x=-600),color="red")+
      geom_text(data = maxLINE,
            aes(y = nucTILEsize, x = TILEshift, label = value),
            color = "black",
            size = 2,
            vjust = 1.5) +  # Adjust position vertically
      # facet_wrap(~TYPE) +
      geom_vline(xintercept = 00, linetype =  3)+
      # geom_vline(xintercept = -420, linetype = 2)+
      geom_hline(yintercept = 100, linetype = 3)+
      labs(title=paste(currTYPE, METHOD, currNUC, sep="_"),
             y="window used to evaluate CLIP signal [nt]",
             x="start position shift relative to the piRNA tile [nt]"
      )+
      theme_cowplot(12)+
      # guides(fill = guide_legend(ncol = 1))+
      theme(
          # legend.position = "none",
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle = 90, hjust = 1),
          axis.title = element_text(size=10),
          strip.text = element_text(size=10),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          aspect.ratio = 1
      )+
        scale_fill_scico(limits = c(minVAL, maxVAL),midpoint=0, palette = "vik", direction = 1 )
        # scale_fill_scico( palette = "vik", direction = 1 )
  
  
    print(p)
     if(currNUC=="T"){
       HEIGHT=100
        WIDTH=100
     }else{
       HEIGHT=100
       WIDTH=100
       dpi=100
     }
     ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".",currNUC,".",currTYPE,".pdf",sep=""), p, height = HEIGHT, width = WIDTH, units = "mm", dpi = 300)    

    if(currNUC=="T"){
      #plot the U graph with its own scale
      p=X %>%
        ggplot()+
        # geom_contour_filled(aes(y=nucTILEsize, x=TILEshift, z=value) )+
        geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
        geom_point_rast(data=maxLINE, aes(y=nucTILEsize, x=TILEshift),color="red")+
        geom_point_rast(aes(y=750, x=-600),color="red")+
        geom_text(data = maxLINE,
              aes(y = nucTILEsize, x = TILEshift, label = value),
              color = "black",
              size = 2,
              vjust = 1.5) +  # Adjust position vertically
        # facet_wrap(~TYPE) +
        geom_vline(xintercept = 00, linetype =  3)+
        # geom_vline(xintercept = -420, linetype = 2)+
        geom_hline(yintercept = 100, linetype = 3)+
        labs(title=paste(currTYPE, METHOD, currNUC, sep="_"),
               y="window used to evaluate CLIP signal [nt]",
               x="start position shift relative to the piRNA tile [nt]"
        )+
        theme_cowplot(12)+
        # guides(fill = guide_legend(ncol = 1))+
        theme(
            # legend.position = "none",
            axis.text = element_text(size=12),
            axis.text.x = element_text(angle = 90, hjust = 1),
            axis.title = element_text(size=10),
            strip.text = element_text(size=10),
            panel.grid.minor = element_blank(),
            panel.grid.major = element_blank(),
            aspect.ratio = 1
        )+
          scale_fill_scico( palette = "vik", direction = 1 )
          # scale_fill_scico( palette = "vik", direction = 1 )
    
    
      print(p)
       if(currNUC=="T"){
         HEIGHT=100
          WIDTH=100
       }else{
         HEIGHT=100
         WIDTH=100
         dpi=100
       }
       ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".",currNUC,".",currTYPE,".rescaled.pdf",sep=""), p, height = HEIGHT, width = WIDTH, units = "mm", dpi = 300)   
    }
     
  }
}

plot the biogenesisEFF heatmap - di-Nucleotide bias

#extract available statistical methods
STATs= RAW %>% 
    select(contains("TILE"), contains("_diNUC_"))%>%
    pivot_longer(-contains("TILE"))%>%
    separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
    select(STATtype)%>%
    unique()%>%
    unlist()

#fix certain parameters
currSIZE=100
METHOD="kendall"
BINWIDTH=0.025

#functions to round up or down to the nearest multiple
round_up_to <- function(x, multiple) {
  ceiling(x / multiple) * multiple
}
round_down_to <- function(x, multiple) {
  floor(x / multiple) * multiple
}

for (currTYPE in c("UTR")){
  
  currTABLE = RAW %>%
    filter(TYPE == currTYPE)
    {}

  #determine the limits for the current filter
  LIMITS=currTABLE %>%
    select(contains("TILE"), contains("_diNUC_"))%>%
    pivot_longer(-contains("TILE"))%>%
    separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
    filter(STATtype == METHOD & TILEsize==currSIZE )%>%
    summarise(minVAL=min(value), maxVAL=max(value))
  #round to the nearest binwidth
  maxVAL=round_up_to(LIMITS$maxVAL,BINWIDTH)
  minVAL=round_down_to(LIMITS$minVAL,BINWIDTH)
  BREAKS=seq(minVAL,maxVAL,by=BINWIDTH)
  
  
  p = currTABLE %>%
      select(contains("TILE"), contains("_diNUC_"),TYPE)%>%
       pivot_longer(-c(contains("TILE"),TYPE))%>%
      separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
      filter(TILEsize==currSIZE & STATtype == METHOD)%>%
      ggplot()+
      geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
      facet_wrap(~NUCid) +
      geom_vline(xintercept = 00, linetype =  3)+
      # geom_vline(xintercept = -420, linetype = 2)+
      geom_hline(yintercept = 100, linetype = 3)+
      labs(title=paste(currTYPE, METHOD, sep="_"),
             y="window used to evaluate CLIP signal [nt]",
             x="start position shift relative to the piRNA tile [nt]"
      )+
      theme_cowplot(12)+
      # guides(fill = guide_legend(ncol = 1))+
      theme(
          # legend.position = "none",
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle = 90, hjust = 1),
          axis.title = element_text(size=10),
          strip.text = element_text(size=10),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          aspect.ratio = 1
      )+
        # scale_fill_scico(limits = c(MIN, MAX),midpoint=0, palette = "vik", direction = 1 )
        scale_fill_scico( palette = "vik", direction = 1 )
  
  ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".diNUC.",currTYPE,".pdf",sep=""), p, height = 600, width = 600, units = "mm", dpi = 300)    

  #loop through all the nucleotides to get individual plots
  for (currNUC in c("TT")){
    #extract the relevant data for the current plot and change T to U
    X=currTABLE %>%
      # filter(TYPE == currTYPE )%>%
      select(contains("TILE"), contains("_diNUC_"),TYPE)%>%
      pivot_longer(-c(contains("TILE"),TYPE))%>%
      separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
      mutate(name = case_when(
        str_detect(name, "NUC_TT") ~ "NUC_UU" ,
        TRUE ~ paste(NUC,NUCid,sep="_")
      ))%>%
      filter(TILEsize==currSIZE & NUCid==currNUC & STATtype == METHOD)
    
      MIN=currTABLE%>%
          select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
          pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
          filter(str_detect(name,METHOD))%>%
          select(value)%>%min()*1.1
      MAX=currTABLE%>%
          select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
          pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
          filter(str_detect(name,METHOD))%>%
          select(value)%>%max()*1.1
  
    
    #determine the maximum correlation
    maxLINE= X %>%
      group_by(TILEsize)%>%
      filter(value==max(value))%>%
      #round number
      mutate(value=round(value,2))
    
    #select the current nucleotide for the plot
    X = X %>% 
      mutate(
        case_when(
          nucTILEsize==min(nucTILEsize) & TILEshift==min(TILEshift) ~ minVAL,
          nucTILEsize==max(nucTILEsize) & TILEshift==max(TILEshift) ~ maxVAL,
          TRUE ~ value  
        )
      )
    
    #plot the graph
    p=X %>%
      ggplot()+
      # geom_contour_filled(aes(y=nucTILEsize, x=TILEshift, z=value) )+
      geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
      geom_point_rast(data=maxLINE, aes(y=nucTILEsize, x=TILEshift),color="red")+
      geom_point_rast(aes(y=750, x=-600),color="red")+
      geom_text(data = maxLINE,
            aes(y = nucTILEsize, x = TILEshift, label = value),
            color = "black",
            size = 2,
            vjust = 1.5) +  # Adjust position vertically
      # facet_wrap(~TYPE) +
      geom_vline(xintercept = 00, linetype =  3)+
      # geom_vline(xintercept = -420, linetype = 2)+
      geom_hline(yintercept = 100, linetype = 3)+
      labs(title=paste(currTYPE, METHOD, currNUC, sep="_"),
             y="window used to evaluate CLIP signal [nt]",
             x="start position shift relative to the piRNA tile [nt]"
      )+
      theme_cowplot(12)+
      # guides(fill = guide_legend(ncol = 1))+
      theme(
          # legend.position = "none",
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle = 90, hjust = 1),
          axis.title = element_text(size=10),
          strip.text = element_text(size=10),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          aspect.ratio = 1
      )+
        # scale_fill_scico(limits = c(MIN, MAX),midpoint=0, palette = "vik", direction = 1 )
        scale_fill_scico( palette = "vik", direction = 1 )
  
  
    print(p)
     if(currNUC=="T"){
       HEIGHT=100
        WIDTH=100
     }else{
       HEIGHT=100
       WIDTH=100
       dpi=100
     }
     ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".",currNUC,".",currTYPE,".pdf",sep=""), p, height = HEIGHT, width = WIDTH, units = "mm", dpi = 300)    
  }
}

plot correlation in line-plots

p = RAW %>% 
  filter(TYPE=="UTR")%>%
  select(contains("TILE"),kendall_NUC_T)%>%
  ggplot(aes(x=TILEshift, y=kendall_NUC_T, color=as.factor(nucTILEsize))) +
  geom_line() +
  geom_vline(xintercept = -600)+
  geom_line(data = . %>% filter(nucTILEsize == 750), color="black",size=2) +
  scale_color_scico_d(palette = "navia") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    aspect.ratio = 1,
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  ) +
{}

print(p)

ggsave("parameter-sweep.line-chart.pdf", p, width = 8, height = 5)

############################################################################################### ############################################################################################### ############################################################################################### # #################### tile-analysis analysis biogenesis-efficiency ################################### ############################################################################################### ############################################################################################### ###############################################################################################

load and prepare data

 # load data
RAW = read_tsv("biogEFF_tiles.100_-600_750.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))%>%
  separate(CHR,c("ID","TYPE"), sep="_",remove=FALSE)%>%
  {}

process and filter data

# filter data
TABLEfilt= RAW %>%
  filter(TYPE == "UTR" | TYPE=="CLUSTER"| TYPE=="CDS" )%>%
  filter(!str_detect(CHR,"GL")) %>%
  select(-contains("ARMI"))%>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5 ) %>%
  {}

nLINES=nrow(TABLEfilt)


#calculate RNAseq/TTseq corrected smallRNA and CLIP counts
TABLEfilt <- TABLEfilt %>%
  mutate(
    across(
      starts_with(c("sRNAdata_")),
      ~.x / TTseq_RPKM ,
      .names = "TTnorm_{.col}"
    ),
    across(
      starts_with(c("sRNAdata_")),
      ~.x / RNAseq_RPKM,
      .names = "RNAnorm_{.col}"
    ),
    across(
      starts_with(c("nucREGION_CLIP_")),
      ~.x / RNAseq_RPKM,
      .names = "RNAnorm_{.col}"
    )
  )


#calculate YB-dependency
TABLEfilt <- TABLEfilt %>%
  mutate(
    YBdependency = `sRNAdata_average-WT`/`sRNAdata_average-YB`,
    YB_foldChange = `sRNAdata_average-YB` / `sRNAdata_average-WT`
    )%>%
  filter(!is.na(YBdependency),!is.na(`RNAnorm_sRNAdata_average-WT`),!is.infinite(YBdependency))%>%
  rename(TTnorm_sRNAdata_average_WT = `TTnorm_sRNAdata_average-WT`)%>%
  rename(RNAnorm_sRNAdata_average_WT = `RNAnorm_sRNAdata_average-WT`)%>%
  {}

add bins to the table

plotTABLEorig = TABLEfilt %>% 
  filter(TYPE=="UTR" | TYPE == "CDS" )%>%
  select(-contains("sRNAdata_average-ARMI"))%>%
  filter(RNAnorm_nucREGION_CLIP_CLIP_YB.all > 0.0005, if_all(starts_with("RNAnorm_sRNAdata_average"), ~ . > 0.00001) )%>%
  filter(!is.na(RNAnorm_sRNAdata_average_WT))%>%
  mutate(
    BIN2 =  as.factor(cut_interval(log10(RNAnorm_sRNAdata_average_WT), n=4, labels = FALSE))
  )%>%
  #rename column `TTnorm_sRNAdata_average-WT` to TTnorm_sRNAdata_average_WT
  {}

# Get bin breakpoints
bin_breaks <- plotTABLEorig %>%
  group_by(BIN2) %>%
  summarise(min_YBdep = min(RNAnorm_sRNAdata_average_WT, na.rm = TRUE)) %>%
  pull(min_YBdep)

# Remove first bin (we don’t need a line at the lowest boundary)
bin_breaks <- sort(bin_breaks[-1])

#determine number per bin
count_data <- plotTABLEorig %>%
  group_by(BIN2) %>%
  summarise(n = n())

correlation between 100nt tiles and YB-CLIP

YB-dependency vs CLIP in binned violins

#copy table to allow modifying the data
plotTABLEclip = plotTABLEorig


# Set seed for reproducible jittering
set.seed(123)
  
p = plotTABLEclip %>%
  ggplot(aes(x = TYPE, y = RNAnorm_sRNAdata_average_WT, color=TYPE)) +
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "UTR"),
    aes(x = 1 + 0.1, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CDS"),
    aes(x = 1 - 0.1, color = TYPE),
    width      = 0.3,   # controls horizontal spread
    varwidth   = FALSE,  # fixed width
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +

  # Modified repel parameters to place labels to the right
  geom_hline(yintercept = bin_breaks, linetype = "dashed", color = "red", size = 1) +
  annotate("text", x = 1.4, y = bin_breaks, 
           label = round(bin_breaks, 3), hjust = 1, vjust = 2, color = "red") +
  scale_y_log10(breaks = scales::log_breaks()) + 
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  annotation_logticks(sides = "l", outside = TRUE) +
  coord_cartesian(clip = "off") +
  # Extend the plot area to make room for labels
  # coord_cartesian(clip = "off", xlim = c(0.5, 1.5)) +
  labs(x = "for-size", y = "Biogenesis Efficiench") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    # Add more right margin for labels if needed
    plot.margin = margin(5.5, 40, 5.5, 5.5, "pt")
  )

p

ggsave2("100nt-tiles-extNUC-UTR_BiogEFF_vs_CLIP.binning_on_violin.750-600.pdf", p, width = 2.5, height = 5)



# Compute maximum y-value per BIN2 + small offset
max_values <- plotTABLEclip %>%
  group_by(BIN2) %>%
  summarise(y_max = max(RNAnorm_nucREGION_CLIP_CLIP_YB.all, na.rm = TRUE) +1)# Increase by 10%

# Define group comparisons
comparisons <- list(c("1", "2"), c("2", "3"), c("3", "4"))

# Merge y_max values for p-value label positions
y_positions <- log10(max_values$y_max)[-1]

p2 = plotTABLEclip %>%
  ggplot(aes(x = BIN2, y = RNAnorm_nucREGION_CLIP_CLIP_YB.all)) +
  # UTR slightly to the right
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "UTR"),
    aes(x = as.numeric(BIN2) - 0.1, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
    # UTR slightly to the right
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CDS"),
    aes(x = as.numeric(BIN2) + 0.1, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
  # geom_quasirandom_rast(size = 0.1, alpha = 1, linewidth = 0.3, draw_quantiles = c(0.5)) +
  # Add count labels
  # geom_text(data = count_data, aes(x = BIN2, y = 0.001, label = n), 
  #           color = "black", size = 3, vjust = 0) +

  # # Add median crossbars
  # stat_summary(
  #   fun = median, 
  #   geom = "crossbar", 
  #   width = 0.5, 
  #   fatten = 1.5, 
  #   color = "red"
  # ) +
  # Add p-values dynamically above max y-values
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  # stat_compare_means(comparisons = comparisons, 
  #                    method = "wilcox.test", 
  #                    label = "p.signif",label.y = y_positions) +  # Use computed y-positions
  scale_y_log10(breaks = scales::log_breaks(), labels = scales::label_number()) + 
  annotation_logticks(sides = "l", outside=TRUE) +
  coord_cartesian(clip = "off") +
  labs(y = "RNAnorm YB-CLIP", x= "Biogenesis Efficiency bins") +
  theme_cowplot(14) +
  theme(
    # aspect.ratio = 1,
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    plot.margin = margin(5.5, 40, 5.5, 5.5, "pt")
  )
p2

ggsave2("100nt-tiles-extNUC_BiogEFF_vs_CLIP.binned_violin.RNAnorm.750-600.pdf", p2, width = 3, height = 5)


y=ggarrange(p,p2,common.legend = TRUE)
print(y)

plot BiogEFF vs nucleotide content

scatter-plot BiogEFF vs nucleotide content

# Copy table to allow modifying the data
plotTABLEnuc = plotTABLEorig %>% 
  select(FBgn, RNAnorm_sRNAdata_average_WT, starts_with("NUC_"), BIN2, TYPE) 

# # Get bin breakpoints
# bin_breaks <- plotTABLEnuc %>%
#   group_by(BIN2) %>%
#   summarise(min_YBdep = min(RNAnorm_sRNAdata_average_WT, na.rm = TRUE)) %>%
#   pull(min_YBdep)
# 
# # Remove first bin (we don’t need a line at the lowest boundary)
# bin_breaks <- sort(bin_breaks[-1])
# 
# # Reshape the data into long format
# plotTABLEnuc_long <- plotTABLEnuc %>%
#   pivot_longer(-c(TYPE,FBgn, RNAnorm_sRNAdata_average_WT, BIN2), names_to = "Nucleotide", values_to = "value") %>%
#   mutate(Nucleotide = gsub("fullLENGTH_NUC_", "", Nucleotide))  # Clean column names
# 
# # Compute statistics per nucleotide (NO log10 transformation)
# stats_nuc <- plotTABLEnuc_long %>%
#   group_by(Nucleotide) %>%
#   summarise(
#     cor_kendall = cor(RNAnorm_sRNAdata_average_WT, value, method = "kendall"),
#     p_kendall = cor.test(RNAnorm_sRNAdata_average_WT, value, method = "kendall")$p.value,
#     
#     # Linear model (NO log transformation)
#     lm_model = list(lm(value ~ log10(RNAnorm_sRNAdata_average_WT))),
#     slope_lm = coef(lm(value ~ log10(RNAnorm_sRNAdata_average_WT)))[2],
#     r_squared_lm = summary(lm(value ~ log10(RNAnorm_sRNAdata_average_WT)))$r.squared,
#     p_lm = summary(lm(value ~ log10(RNAnorm_sRNAdata_average_WT)))$coefficients[2, 4],
#     
#     # GAM Model (NO log transformation)
#     gam_model = list(gam(value ~ s(RNAnorm_sRNAdata_average_WT, bs = "cs"))),
#     r_squared_gam = summary(gam(value ~ s(RNAnorm_sRNAdata_average_WT, bs = "cs")))$r.sq,
#     edf_gam = summary(gam(value ~ s(RNAnorm_sRNAdata_average_WT, bs = "cs")))$s.table[1, "edf"],
#     p_gam = summary(gam(value ~ s(RNAnorm_sRNAdata_average_WT, bs = "cs")))$s.table[1, "p-value"]
#   ) %>%
#   mutate(
#     p_kendall = formatC(p_kendall, format = "e", digits = 2),
#     p_lm = formatC(p_lm, format = "e", digits = 2),
#     p_gam = formatC(p_gam, format = "e", digits = 2),
#     slope_lm = round(slope_lm, 3),
#     r_squared_lm = round(r_squared_lm, 3),
#     r_squared_gam = round(r_squared_gam, 3),
#     edf_gam = round(edf_gam, 2),
#     cor_kendall = round(cor_kendall, 3)
#   )
# 
# # Merge statistics with long-form data for annotation
# plotTABLEnuc_long <- plotTABLEnuc_long %>%
#   left_join(stats_nuc, by = "Nucleotide")
# 
# # Create a function to generate annotation labels
# generate_labels <- function(df) {
#   paste0(
#     "LM\nSlope = ", df$slope_lm, "\nR² = ", df$r_squared_lm, "\nP = ", df$p_lm, "\n\n",
#     "GAM\nR² = ", df$r_squared_gam, "\nEDF = ", df$edf_gam, "\nP = ", df$p_gam, "\n\n",
#     "Kendall's τ = ", df$cor_kendall, "\nP = ", df$p_kendall
#   )
# }
# 
# # Plot (NO log scale on y-axis)
# x = plotTABLEnuc_long %>%
#   ggplot(aes(x=RNAnorm_sRNAdata_average_WT, y=value)) +
#   geom_vline(xintercept=1, linetype="dashed", color="darkgrey")+
#   geom_point_rast(size=0.5, alpha=0.3) +
#   geom_smooth(se = TRUE, formula = y ~ s(x, bs = "cs")) +
#   geom_smooth(se = TRUE, method = "lm") +
#   facet_wrap(~Nucleotide) +
#   geom_text(data = stats_nuc, aes(x = min(plotTABLEnuc$RNAnorm_sRNAdata_average_WT), 
#                                   y = max(plotTABLEnuc_long$value, na.rm = TRUE),
#                                   label = generate_labels(stats_nuc)), 
#             hjust = 0, size = 3, vjust = 1) + 
#   # geom_vline(xintercept = bin_breaks, linetype="dotted", color="red") +  # Bin separators
#   scale_x_log10(breaks = scales::log_breaks()) + 
#   scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
#   annotation_logticks(sides = "b", outside=TRUE) +
#   coord_cartesian(clip = "off") +
#   labs(x="Biogenesis Efficiency", y="[%] Nucleotide") +
#   theme_cowplot(14) +
#   theme(
#     axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
#     axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
#     aspect.ratio = 1,
#     panel.grid.minor = element_blank(),
#     panel.grid.major = element_blank(),
#     legend.position = "none",
#     
#     # Black border on facet strips
#     strip.background = element_rect(fill = "white", color = "black", size = 1), 
#     strip.text = element_text(face = "bold", color = "black")
#   )
# 
# 
# # Save 
# print(x)
# ggsave2("100nt-tiles-extNUC_BiogEFF_vs_NUC.scatter.RNAnorm.pdf", x, width = 10, height = 10)

binned-violins BiogEFF vs nucleotide content

plotTABLEnuc2 = plotTABLEnuc %>%
  pivot_longer(-c(FBgn, RNAnorm_sRNAdata_average_WT, BIN2, TYPE), names_to = "name", values_to = "value")%>%
  separate(name, c("xy", "name"), sep = "_")%>%
  group_by(name, BIN2) 

count_data <- plotTABLEnuc2 %>%
  summarise(n = n())
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
max_values <- plotTABLEnuc2 %>%
  group_by(BIN2) %>%
  summarise(group_max = max(value, na.rm = TRUE)) %>%
  mutate(
    y_max = pmax(
      group_max,
      lead(group_max, default = -Inf),  # compare with next group
      lag(group_max, default = -Inf)    # compare with previous group
    ) +2  # Add 2% offset
  )

# Define group comparisons
comparisons <- list(c("1", "2"), c("2", "3"), c("3", "4"))

# Merge y_max values for p-value label positions
y_positions <- max_values$y_max[-c(1)]   # Adjust y-positions for p-values


p2 = plotTABLEnuc2 %>%
  ggplot(aes( y = value)) +
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "UTR"),
    aes(x = as.double(BIN2) - 0.2, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
    # UTR slightly to the right
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CDS"),
    aes(x = as.double(BIN2) + 0.2, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
    # Add count labels
  facet_row(~name)+
  geom_text(data = count_data %>% filter(name=="T"), aes(x = as.numeric(BIN2), y = 8, label = n), 
            color = "black", size = 3, vjust = 0) +

  # Add median crossbars
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  # stat_summary(
  #   fun = median, 
  #   geom = "crossbar", 
  #   width = 0.75, 
  #   fatten = 1.5, 
  #   color = "red"
  # ) +
  # Add p-values dynamically above max y-values
  # stat_compare_means(comparisons = comparisons, 
  #                    method = "wilcox.test", 
  #                    label = "p.signif",label.y = y_positions) +  # Use computed y-positions
  # 
  labs(y = "[%] Nucleotide", x= "Biogenesis Efficiency bins") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  )
p2

ggsave2("100nt-tiles-extNUC_vs_allNUC.binned_quasirandom.RNAnorm.750-600.pdf", p2, width = 8, height = 5)



p2 = plotTABLEnuc2 %>%
  filter(name=="T")%>%
  ggplot(aes( y = value)) +
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "UTR"),
    aes(x = as.double(BIN2) - 0.2, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
    # UTR slightly to the right
  geom_quasirandom_rast(
    data = . %>% filter(TYPE == "CDS"),
    aes(x = as.double(BIN2) + 0.2, color = TYPE),
    width      = 0.3,
    varwidth   = FALSE,
    groupOnX   = FALSE,
    size       = 0.01,
    alpha      = 1
  ) +
    # Add count labels
  geom_text(data = count_data %>% filter(name=="T"), aes(x = as.numeric(BIN2), y = 8, label = n), 
            color = "black", size = 3, vjust = 0) +

  # Add median crossbars
  scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) +  # Add this line
  # stat_summary(
  #   fun = median, 
  #   geom = "crossbar", 
  #   width = 0.75, 
  #   fatten = 1.5, 
  #   color = "red"
  # ) +
  # Add p-values dynamically above max y-values
  # stat_compare_means(comparisons = comparisons, 
  #                    method = "wilcox.test", 
  #                    label = "p.signif",label.y = y_positions) +  # Use computed y-positions
  # 
  labs(y = "[%] Nucleotide", x= "Biogenesis Efficiency bins") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  )
p2

ggsave2("100nt-tiles-extNUC_vs_U.binned_quasirandom.RNAnorm.750-600.pdf", p2, width = 3, height = 5)

kmer - analysis

XY = plotTABLEorig %>% 
  select(FBgn, TYPE, starts_with("triNUC"),BIN2)%>%
  pivot_longer(-c(FBgn, TYPE, BIN2), names_to = "NUC", values_to = "value")%>%
  mutate(
    BIN2 = case_when(
      BIN2 == 2 ~ "1",
      BIN2 == 4 ~ "3",
      TRUE ~ BIN2
    )
  )%>%
  group_by(BIN2,NUC )%>%
  summarise(
    MEAN = mean(value, na.rm = TRUE)
  )%>%
  pivot_wider(names_from = BIN2, values_from = MEAN)%>%
  mutate(
    RATIO = `3`/`1`
  )%>%
  separate(NUC, c("name", "kmer"), sep = "_", remove = FALSE)%>%
    #add category that determines the number of Ts per motif 
    mutate(
      nT = str_count(kmer, "T"),
      nT = case_when(
        nT == 0 ~ "0T",
        nT == 1 ~ "1T",
        nT == 2 ~ "2T",
        nT == 3 ~ "3T",
        TRUE ~ "4T"
      )
    )
  
# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
#invert vector

graph_colors <- c(okabe_ito[1:3])
graph_colors = c(rev(graph_colors), "black")

  p = XY %>%
    mutate(jitter_x = jitter(rep(1, n()), amount = 0.1),
           name = as.factor(name)) %>% # Add explicit jitter
    ggplot(aes(x = jitter_x, y = RATIO, label = kmer, color = nT)) +
      # geom_point_rast() + # Use geom_point_rast instead of geom_quasirandom
      geom_quasirandom_rast() +
      geom_label_repel(data = .%>% filter(RATIO>=1.1), min.segment.length = 0, box.padding = 0.2, size = 3, max.overlaps = 100) +
      geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
      labs(x = "Jittered X-axis", y = "Enrichment") +
  theme_cowplot(14) +
  theme(
    axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
    axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    legend.position = "none",
    
    # Black border on facet strips
    strip.background = element_rect(fill = "white", color = "black", size = 1), 
    strip.text = element_text(face = "bold", color = "black")
  )+
    scale_color_manual(values = graph_colors) 
  
  print(p)

ggsave2("100nt-tiles-extNUC_kmer-enrichment.pdf", p, width = 3, height = 5)

############################################################################################### ############################################################################################### ############################################################################################### # #################### train model - for extended nucleotide tiles ################################### ############################################################################################### ############################################################################################### ###############################################################################################

load and prepare data

 # load data
# RAW = read_tsv("biogEFF_tiles.100_-600_750.txt", col_names = TRUE)%>%
RAW = read_tsv("tiles.100_-600_750_quantified.allLIBs.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))%>%
  separate(CHR,c("ID","TYPE"), sep="_",remove=FALSE)%>%
  mutate(
    across(
      starts_with(c("sRNAdata_")),
      ~ .x / RNAseq_RPKM,
      .names = "RNAnorm_{.col}"
    )
  )%>%
  filter(!is.na(`RNAnorm_sRNAdata_average-WT`),!is.infinite(`RNAnorm_sRNAdata_average-WT`))%>%
  {}

NUCclass="diNUC_"

process and filter data

# filter data
TABLEfilt= RAW %>%
  filter(TYPE == "UTR" )%>%
  filter(!str_detect(CHR,"GL")) %>%
  dplyr::select(-contains("ARMI"))%>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>1,TTseq_RPKM>1 ) %>%
  {}

nLINES=nrow(TABLEfilt)


#calculate YB-dependency
TABLEfilt <- TABLEfilt %>%
  mutate(
    sRNAdata_average_WT = log10(`sRNAdata_average-WT_CLUSTERuniq`),
    # sRNAdata_average_WT = `sRNAdata_average-WT_CLUSTERuniq`,
    RNAnorm_sRNAdata_average_WT = `RNAnorm_sRNAdata_average-WT_CLUSTERuniq`,
    # across(
    #   starts_with(c("NUC_")),
    #   ~ log10(.x + 1),  # Add 1 to avoid log(0)
    #   .names = "{.col}"
    # ),
     # RNAseq_RPKM = log10(RNAseq_RPKM),
     # TTseq_RPKM = log10(TTseq_RPKM)
    )%>%
  filter(!is.na(sRNAdata_average_WT),!is.na(sRNAdata_average_WT),!is.infinite(sRNAdata_average_WT))%>%
  {}
# Load the library


# Start the H2O cluster
# h2o.no_progress()
h2o.init(max_mem_size = "20g", nthreads = -1)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 days 8 hours 
##     H2O cluster timezone:       Europe/Vienna 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.46.0.7 
##     H2O cluster version age:    10 months and 12 days 
##     H2O cluster name:           H2O_started_from_R_dominik.handler_fuo893 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   17.38 GB 
##     H2O cluster total cores:    11 
##     H2O cluster allowed cores:  11 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.2 (2023-10-31)
h2o.removeAll()


#random splitting
VARI="sRNAdata_average_WT"


quantiles <- quantile(TABLEfilt$RNAseq_RPKM, probs = c(0.75, 0.90, 0.95))

TABLEfilt$WEIGHT <- ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[3], 10,    # Top 5%: weight = 10
                              ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[2], 5,     # Top 10%: weight = 5
                              ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[1], 2,     # Top 25%: weight = 2
                                     1)))                                        # Others: weight = 1

scaled_RNA <- (TABLEfilt$`sRNAdata_average-WT_CLUSTERuniq` - min(TABLEfilt$`sRNAdata_average-WT_CLUSTERuniq`)) / 
                   (max(TABLEfilt$`sRNAdata_average-WT_CLUSTERuniq`) - min(TABLEfilt$`sRNAdata_average-WT_CLUSTERuniq`))*1.4
TABLEfilt$WEIGHT =  100^scaled_RNA

max_smallRNA <- max(TABLEfilt$`sRNAdata_average-WT_CLUSTERuniq`)
TABLEfilt$WEIGHT <- 100^(1/((max_smallRNA + 1) / (TABLEfilt$`sRNAdata_average-WT_CLUSTERuniq` + 1)))

p=TABLEfilt %>% 
  ggplot(aes(x=`sRNAdata_average-WT_CLUSTERuniq`, y=WEIGHT)) +
  geom_point() 

print(p)

#splitting keeping genes together
TABLEmodel = TABLEfilt%>%
  filter(TYPE=="UTR",nPOS>0)%>%
  select(FBgn, VARI, starts_with(NUCclass),TTseq_RPKM,CHR,RNAseq_RPKM,WEIGHT,UTR_LENGTH,nPOS)


set.seed(1)                                                 # reproducible
unique_chr <- unique(TABLEmodel$CHR)

train_chr <- sample(unique_chr,
                    size = floor(0.8 * length(unique_chr)))  # ≈ 75 % of chromosomes
residual_chr <- setdiff(unique_chr, train_chr)  # Remaining chromosomes
val_chr <- sample(residual_chr,
                    size = floor(0.5 * length(residual_chr)))  # ≈ 75 % of chromosomes

test_chr  <- setdiff(residual_chr, val_chr)

# --- 3.  Split the rows on those chromosome sets -------------------------
train_df <- TABLEmodel %>% filter(CHR %in% train_chr)
val_df <- TABLEmodel %>% filter(CHR %in% val_chr)
test_df  <- TABLEmodel %>% filter(CHR %in% test_chr)

# --- 4.  Drop CHR (if you don’t want it as a feature) and send to H2O ----
train <- as.h2o(train_df %>% select(-CHR), destination_frame = "train.hex")
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
val  <- as.h2o(val_df  %>% select(-CHR), destination_frame = "val.hex")
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
test  <- as.h2o(test_df  %>% select(-CHR), destination_frame = "test.hex")
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# --- 5.  (Optional) inspect sizes ----------------------------------------
n_train <- nrow(train)
n_val <- nrow(val)
n_test  <- nrow(test)

percent_train = round(n_train *100 / sum(n_train, n_val, n_test),2)
percent_val = round(n_val *100 / sum(n_train, n_val, n_test),2)
percent_test = round(n_test *100 / sum(n_train, n_val, n_test),2)

h2o.dim(train)
## [1] 15298    23
h2o.dim(val)
## [1] 1961   23
h2o.dim(test)
## [1] 1892   23
a=as.data.frame(train) %>%
  ggplot(aes(x=!!sym(VARI)))+
  geom_histogram()+
  # scale_x_log10(limits=c(1,1000))+
  annotate("text", x = 0, y = 200, label =paste("n=",n_train,"\n[%]=",percent_train,sep=""))
b=as.data.frame(test)%>%
  ggplot(aes(x=!!sym(VARI)))+
  geom_histogram()+
  # scale_x_log10(limits=c(1,1000))+
  annotate("text", x = 0, y = 200, label =paste("n=",n_test,"\n[%]=",percent_test,sep=""))
c=as.data.frame(val) %>%
  ggplot(aes(x=!!sym(VARI)))+
  geom_histogram()+
  # scale_x_log10(limits=c(1,1000))+
  annotate("text", x = 0, y = 200, label =paste("n=",n_val,"\n[%]=",percent_val,sep=""))


# print( ggarrange( a,b,c, ncol=1))
PLOT=ggarrange( a,b,c, ncol=1)
print(PLOT)

# ggsave(PLOT, filename = "split-distribution.pdf", width = 6, height = 8) 
# Define the four scenarios
scenarios <- list(
  all_vari = list(name = "Base Model with ALL Variables",
                exclude = c()),
  no_nuc = list(name = "Without Nucleotide_Content",
                exclude = c("diNUC_", "triNUC_", "tetraNUC_")),
  no_rna = list(name = "Without RNAseq",
                exclude = c("RNAseq_RPKM")),
  no_tt = list(name = "Without TTseq",
               exclude = c("TTseq_RPKM")),
  no_rna_tt = list(name = "Without RNAseq and TTseq",
                   exclude = c("RNAseq_RPKM", "TTseq_RPKM")  ),
  no_pos = list(name = "Without Positional information",
                   exclude = c("nPOS"))
)


# Store results
results_list <- list()
plot_list <- list()

# Loop through each scenario
for (scenario_name in names(scenarios)) {
  
  cat("\n\n========================================\n")
  cat("Training model:", scenarios[[scenario_name]]$name, "\n")
  cat("========================================\n\n")
  
  # Define variables
  y <- VARI
  x <- setdiff(names(train), y)
  # x <- x[!grepl("nPOS", x)]
  # x <- x[!grepl("UTR_LENGTH", x)]
  
  # Remove FBgn and WEIGHT
  x <- x[!grepl("FBgn", x)]
  
  # Remove features based on scenario
  for (pattern in scenarios[[scenario_name]]$exclude) {
    x <- x[!grepl(pattern, x)]
  }
  
  cat("Features used:", length(x), "\n")
  cat("Features:", paste(x, collapse = ", "), "\n\n")
  
  #if no saved model esixts train new one
  if (file.exists(paste0("model_ablation_study_",scenario_name))) {
    yb_ml <- h2o.loadModel(paste0("model_ablation_study_",scenario_name))
    cat("Loaded existing model for scenario:", scenario_name, "\n\n")
    next
  }else{
    # Train model
    yb_ml_scenario <- h2o.automl(
      x = x, y = y,
      training_frame = train,
      max_models = 30,
      seed = 3242,
      nfolds = 0,
      validation_frame = val,
      weights_column = "WEIGHT",
      leaderboard_frame = test,
      include_algos = c("GBM"),
      stopping_metric = "RMSE",
      sort_metric = "RMSE"
    )
    
    # Get leaderboard and determine best model by CCC
    leaderboard <- as.data.frame(yb_ml_scenario@leaderboard)
    ccc_vec <- sapply(leaderboard$model_id, function(mid) {
      model <- h2o.getModel(mid)
      pred <- h2o.predict(model, test)
      actual <- 10^as.vector(test[[y]])
      predicted <- 10^as.vector(pred)
      ccc_value <- CCC(predicted, actual)
      ccc_value <- round(unlist(ccc_value[1])[1], 5)
      ccc_value
    })
    
    leaderboard$CCC <- ccc_vec
    leaderboard <- leaderboard[order(-leaderboard$CCC), ]
    LEADER <- leaderboard[1, 1]
    yb_ml <- h2o.getModel(LEADER)
    
    #safe model
    h2o.saveModel(filename=paste0("model_ablation_study_",scenario_name),object = yb_ml, path = getwd(), force = TRUE)
  }
  
  # Store model and leaderboard
  results_list[[scenario_name]] <- list(
    model = yb_ml
  )
  
  # Variable importance plot
  varimp_data <- h2o.varimp(yb_ml)
  p_varimp <- varimp_data %>%
    ggplot(aes(x = reorder(variable, scaled_importance), y = scaled_importance)) +
    geom_col() +
    coord_flip() +
    ggtitle(scenarios[[scenario_name]]$name) +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 12)
    )
  
  print(p_varimp)
  ggsave(p_varimp, filename = paste0("variable-importance-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
  
  # Predictions on test set (log scale)
  pred <- h2o.predict(yb_ml, test)
  pred_dataFIN <- as_tibble(test) %>%
    bind_cols(PRED = as.vector(pred))
  
  correlation <- lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_WT) %>% 
    summary() %>% .$r.squared
  CCC_value <- CCC(pred_dataFIN$sRNAdata_average_WT, pred_dataFIN$PRED)
  PEAR <- cor.test(pred_dataFIN$sRNAdata_average_WT, pred_dataFIN$PRED, method = "pearson")$estimate
  SPEAR <- SpearmanRho(pred_dataFIN$sRNAdata_average_WT, pred_dataFIN$PRED)
  
  p_log <- pred_dataFIN %>%
    ggplot(aes(x = sRNAdata_average_WT, y = PRED)) +
    geom_point_rast(size = 0.5, alpha = 0.3) +
    geom_density_2d(color = "red", alpha = 0.5) +
    geom_smooth(method = "lm") +
    geom_abline(intercept = 0, slope = 1) +
    scale_x_continuous(limits = c(-1, 3.5)) +
    scale_y_continuous(limits = c(-1, 3.5)) +
    annotate("text", x = -1, y = 2,
             label = paste("R2: ", round(correlation, 2), "\n", 
                          "CCC=", round(unlist(CCC_value[1])[1], 2), "\n",
                          "PEAR=", round(PEAR, 2), "\n",
                          "SPEAR=", round(SPEAR, 2), "\n",
                          paste("n=", nrow(pred_dataFIN), sep = ""), sep = ""),
             hjust = 0) +
    ggtitle(paste(scenarios[[scenario_name]]$name, "(log scale)")) +
    theme_bw() +
    scale_color_viridis_c() +
    theme(
      panel.grid.major = element_blank(),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 12),
      panel.grid.minor = element_blank(),
      aspect.ratio = 1
    )
  
  print(p_log)
  ggsave(p_log, filename = paste0("Prediction_vs_real_log-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
  
  # Predictions on test set (original scale)
  pred_dataFIN_orig <- pred_dataFIN %>%
    mutate(
      sRNAdata_average_WT = 10^sRNAdata_average_WT,
      PRED = 10^PRED
    )
  
  correlation_orig <- lm(pred_dataFIN_orig$PRED ~ pred_dataFIN_orig$sRNAdata_average_WT) %>% 
    summary() %>% .$r.squared
  CCC_value_orig <- CCC(pred_dataFIN_orig$sRNAdata_average_WT, pred_dataFIN_orig$PRED)
  PEAR_orig <- cor.test(pred_dataFIN_orig$sRNAdata_average_WT, pred_dataFIN_orig$PRED, method = "pearson")$estimate
  SPEAR_orig <- SpearmanRho(pred_dataFIN_orig$sRNAdata_average_WT, pred_dataFIN_orig$PRED)
  
  p_orig <- pred_dataFIN_orig %>%
    ggplot(aes(x = sRNAdata_average_WT, y = PRED)) +
    geom_point_rast(size = 0.5, alpha = 0.3) +
    geom_density_2d(color = "red", alpha = 0.5) +
    geom_smooth(method = "lm") +
    geom_abline(intercept = 0, slope = 1) +
    scale_x_log10(limits = c(0.01, 5000)) +
    scale_y_log10(limits = c(0.01, 5000)) +
    annotate("text", x = 0.01, y = 3,
             label = paste("R2: ", round(correlation_orig, 2), "\n",
                          "CCC=", round(unlist(CCC_value_orig[1])[1], 2), "\n",
                          "PEAR=", round(PEAR_orig, 2), "\n",
                          "SPEAR=", round(SPEAR_orig, 2), "\n",
                          paste("n=", nrow(pred_dataFIN_orig), sep = ""), sep = ""),
             hjust = 0) +
    ggtitle(paste(scenarios[[scenario_name]]$name, "(original scale)")) +
    theme_bw() +
    scale_color_viridis_c() +
    theme(
      panel.grid.major = element_blank(),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 12),
      panel.grid.minor = element_blank(),
      aspect.ratio = 1
    )
  
  print(p_orig)
  ggsave(p_orig, filename = paste0("Prediction_vs_real_orig-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
  
  # Store plots
  plot_list[[scenario_name]] <- list(
    varimp = p_varimp,
    log_scale = p_log,
    orig_scale = p_orig
  )
  
  # Store metrics
  results_list[[scenario_name]]$metrics <- list(
    log_scale = list(R2 = correlation, CCC = unlist(CCC_value[1])[1], 
                     PEAR = PEAR, SPEAR = SPEAR),
    orig_scale = list(R2 = correlation_orig, CCC = unlist(CCC_value_orig[1])[1],
                      PEAR = PEAR_orig, SPEAR = SPEAR_orig)
  )
}
## 
## 
## ========================================
## Training model: Base Model with ALL Variables 
## ========================================
## 
## Features used: 21 
## Features: diNUC_GT, diNUC_CG, diNUC_AA, diNUC_GA, diNUC_TA, diNUC_TG, diNUC_TT, diNUC_CC, diNUC_GG, diNUC_AC, diNUC_TC, diNUC_AG, diNUC_CA, diNUC_CT, diNUC_GC, diNUC_AT, TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS 
## 
## Loaded existing model for scenario: all_vari 
## 
## 
## 
## ========================================
## Training model: Without Nucleotide_Content 
## ========================================
## 
## Features used: 5 
## Features: TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS 
## 
## Loaded existing model for scenario: no_nuc 
## 
## 
## 
## ========================================
## Training model: Without RNAseq 
## ========================================
## 
## Features used: 20 
## Features: diNUC_GT, diNUC_CG, diNUC_AA, diNUC_GA, diNUC_TA, diNUC_TG, diNUC_TT, diNUC_CC, diNUC_GG, diNUC_AC, diNUC_TC, diNUC_AG, diNUC_CA, diNUC_CT, diNUC_GC, diNUC_AT, TTseq_RPKM, WEIGHT, UTR_LENGTH, nPOS 
## 
## Loaded existing model for scenario: no_rna 
## 
## 
## 
## ========================================
## Training model: Without TTseq 
## ========================================
## 
## Features used: 20 
## Features: diNUC_GT, diNUC_CG, diNUC_AA, diNUC_GA, diNUC_TA, diNUC_TG, diNUC_TT, diNUC_CC, diNUC_GG, diNUC_AC, diNUC_TC, diNUC_AG, diNUC_CA, diNUC_CT, diNUC_GC, diNUC_AT, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS 
## 
## Loaded existing model for scenario: no_tt 
## 
## 
## 
## ========================================
## Training model: Without RNAseq and TTseq 
## ========================================
## 
## Features used: 19 
## Features: diNUC_GT, diNUC_CG, diNUC_AA, diNUC_GA, diNUC_TA, diNUC_TG, diNUC_TT, diNUC_CC, diNUC_GG, diNUC_AC, diNUC_TC, diNUC_AG, diNUC_CA, diNUC_CT, diNUC_GC, diNUC_AT, WEIGHT, UTR_LENGTH, nPOS 
## 
## Loaded existing model for scenario: no_rna_tt 
## 
## 
## 
## ========================================
## Training model: Without Positional information 
## ========================================
## 
## Features used: 20 
## Features: diNUC_GT, diNUC_CG, diNUC_AA, diNUC_GA, diNUC_TA, diNUC_TG, diNUC_TT, diNUC_CC, diNUC_GG, diNUC_AC, diNUC_TC, diNUC_AG, diNUC_CA, diNUC_CT, diNUC_GC, diNUC_AT, TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH 
## 
## Loaded existing model for scenario: no_pos
# Print summary comparison
cat("\n\n========================================\n")
## 
## 
## ========================================
cat("SUMMARY COMPARISON\n")
## SUMMARY COMPARISON
cat("========================================\n\n")
## ========================================
# for (scenario_name in names(scenarios)) {
#   cat(scenarios[[scenario_name]]$name, ":\n")
#   cat("  Log scale - R2:", round(results_list[[scenario_name]]$metrics$log_scale$R2, 3),
#       "CCC:", round(results_list[[scenario_name]]$metrics$log_scale$CCC, 3), "\n")
#   cat("  Orig scale - R2:", round(results_list[[scenario_name]]$metrics$orig_scale$R2, 3),
#       "CCC:", round(results_list[[scenario_name]]$metrics$orig_scale$CCC, 3), "\n\n")
# }
scenario_name="all_vari"
yb_ml <- h2o.loadModel(paste0("model_ablation_study_",scenario_name))
cat("Loaded existing model for scenario:", scenario_name, "\n\n")
## Loaded existing model for scenario: all_vari
exm <- h2o.explain(yb_ml, test, top_n_features=5)
exm
## 
## 
## Residual Analysis
## =================
## 
## > Residual Analysis plots the fitted values vs residuals on a test dataset. Ideally, residuals should be randomly distributed. Patterns in this plot can indicate potential problems with the model selection, e.g., using simpler model than necessary, not accounting for heteroscedasticity, autocorrelation, etc. Note that if you see "striped" lines of residuals, that is an artifact of having an integer valued (vs a real valued) response variable.

## 
## 
## Learning Curve Plot
## ===================
## 
## > Learning curve plot shows the loss function/metric dependent on number of iterations or trees for tree-based algorithms. This plot can be useful for determining whether the model overfits.

## 
## 
## Variable Importance
## ===================
## 
## > The variable importance plot shows the relative importance of the most important variables in the model.

## 
## 
## SHAP Summary
## ============
## 
## > SHAP summary plot shows the contribution of the features for each instance (row of data). The sum of the feature contributions and the bias term is equal to the raw prediction of the model, i.e., prediction before applying inverse link function.

## 
## 
## Partial Dependence Plots
## ========================
## 
## > Partial dependence plot (PDP) gives a graphical depiction of the marginal effect of a variable on the response. The effect of a variable is measured in change in the mean response. PDP assumes independence between the feature for which is the PDP computed and the rest.

## 
## 
## Individual Conditional Expectations
## ===================================
## 
## > An Individual Conditional Expectation (ICE) plot gives a graphical depiction of the marginal effect of a variable on the response. ICE plots are similar to partial dependence plots (PDP); PDP shows the average effect of a feature while ICE plot shows the effect for a single instance. This function will plot the effect for each decile. In contrast to the PDP, ICE plots can provide more insight, especially when there is stronger feature interaction.

## test on CDS

# filter data
TABLEfiltCDS= RAW %>%
  filter(TYPE == "CDS"  )%>%
  filter(!str_detect(CHR,"GL")) %>%
  dplyr::select(-contains("ARMI"))%>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>10,TTseq_RPKM>1  ) %>%
  {}

nLINES=nrow(TABLEfilt)


#calculate YB-dependency
TABLEfiltCDS <- TABLEfiltCDS %>%
  mutate(
    sRNAdata_average_WT = log10(`sRNAdata_average-WT_CLUSTERuniq`),
    # sRNAdata_average_WT = `sRNAdata_average-WT_CLUSTERuniq`,
    RNAnorm_sRNAdata_average_WT = `RNAnorm_sRNAdata_average-WT_CLUSTERuniq`,
    # across(
    #   starts_with(c("NUC_")),
    #   ~ log10(.x + 1),  # Add 1 to avoid log(0)
    #   .names = "{.col}"
    # ),
     # RNAseq_RPKM = log10(RNAseq_RPKM),
     # TTseq_RPKM = log10(TTseq_RPKM)
    )%>%
  filter(!is.na(sRNAdata_average_WT),!is.na(sRNAdata_average_WT),!is.infinite(sRNAdata_average_WT))%>%
  {}


TABLEmodelCDS = TABLEfiltCDS%>%
  filter(TYPE=="CDS",nPOS>0)%>%
  select(FBgn, VARI, starts_with(NUCclass),TTseq_RPKM,CHR,RNAseq_RPKM,UTR_LENGTH,nPOS)



# --- 3.  Split the rows on those chromosome sets -------------------------
test_chr_CDS = gsub("UTR","CDS", test_chr)
test_df_CDS  <- TABLEmodelCDS %>% filter(CHR %in% test_chr_CDS)

# --- 4.  Drop CHR (if you don’t want it as a feature) and send to H2O ----
test_CDS  <- as.h2o(test_df_CDS  %>% select(-CHR), destination_frame = "test.hex")
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
# --- 5.  (Optional) inspect sizes ----------------------------------------
n_test_CDS  <- nrow(test_CDS)


#load all variable model
yb_ml <- h2o.loadModel(paste("model_ablation_study_all_vari",sep=""))


  # Predictions on test set (log scale)
  pred_CDS <- h2o.predict(yb_ml, test_CDS)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
  pred_CDS_dataFIN <- as_tibble(test_CDS) %>%
    bind_cols(PRED = as.vector(pred_CDS))
  
  correlation <- lm(pred_CDS_dataFIN$PRED ~ pred_CDS_dataFIN$sRNAdata_average_WT) %>% 
    summary() %>% .$r.squared
  CCC_value <- CCC(pred_CDS_dataFIN$sRNAdata_average_WT, pred_CDS_dataFIN$PRED)
  PEAR <- cor.test(pred_CDS_dataFIN$sRNAdata_average_WT, pred_CDS_dataFIN$PRED, method = "pearson")$estimate
  SPEAR <- SpearmanRho(pred_CDS_dataFIN$sRNAdata_average_WT, pred_CDS_dataFIN$PRED)

  
  
  #calculate the offset calculation
  #here I switch to Real ~ Pred 
  fit <- lm( sRNAdata_average_WT ~PRED , data = pred_CDS_dataFIN)
  intercept <- fit$coefficients[1]
  slope <- fit$coefficients[2]  
  correction=mean(pred_CDS_dataFIN$sRNAdata_average_WT - pred_CDS_dataFIN$PRED)
  
  #calculate data to plot model in ggplot
  newdata <- data.frame(PRED = seq(min(pred_CDS_dataFIN$PRED), max(pred_CDS_dataFIN$PRED), length.out = 100))
  newdata$x_pred <- predict(fit, newdata)
  
  p_log <- pred_CDS_dataFIN %>%
    ggplot(aes(x = sRNAdata_average_WT, y = PRED)) +
    geom_point_rast(size = 0.5, alpha = 0.3) +
    geom_density_2d(color = "red", alpha = 0.5) +
    geom_line(data=newdata, aes(y = PRED, x = x_pred,label=PRED), color = "red", linetype = "dotted", linewidth=2 ) +
    geom_smooth(method = "lm") +
    geom_abline(intercept = 0, slope = 1) +
    scale_x_continuous(limits = c(-2, 3)) +
    scale_y_continuous(limits = c(-2, 3)) +
    annotate("text", x = -2, y = 1,
             label = paste("R2: ", round(correlation, 2), "\n", 
                          "CCC=", round(unlist(CCC_value[1])[1], 2), "\n",
                          "PEAR=", round(PEAR, 2), "\n",
                          "SPEAR=", round(SPEAR, 2), "\n",
                          paste("n=", nrow(pred_CDS_dataFIN), sep = ""), sep = ""),
             hjust = 0) +
    ggtitle(paste(scenarios[[scenario_name]]$name, "(log scale)")) +
    theme_bw() +
    scale_color_viridis_c() +
    theme(
      panel.grid.major = element_blank(),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 12),
      panel.grid.minor = element_blank(),
      aspect.ratio = 1
    )
## Warning in geom_line(data = newdata, aes(y = PRED, x = x_pred, label = PRED), :
## Ignoring unknown aesthetics: label
  print(p_log)
## `geom_smooth()` using formula = 'y ~ x'

  ggsave(p_log, filename = paste0("Prediction_vs_real_log.CDS.pdf"), width = 6, height = 6, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
    #determine calibration curve
  calibration_curve <- pred_CDS_dataFIN %>%
    mutate(bin = cut_interval(PRED, n=20)) %>%
    group_by(bin) %>%
    summarize(
      mean_predicted_probability = mean(PRED),
      observed_frequency = mean(sRNAdata_average_WT)
    )

  # Plot calibration curve
  p=calibration_curve %>% 
  ggplot(aes(y=mean_predicted_probability, x=observed_frequency))+
    geom_point()+
    geom_smooth(method = "lm", se = TRUE, color = "blue") +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
    labs(title = "Calibration Curve", x = "Mean Observed sRNA count", y = "Mean Predicted sRNA count") +
    theme_cowplot(14)+
    # scale_y_log10(limits=c(3,10000))+
    # scale_x_log10(limits=c(3,10000))+
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 12),
      axis.title =element_text(size = 12),
      aspect.ratio = 1
    )
  print(p)
## `geom_smooth()` using formula = 'y ~ x'

    # Predictions on test set (log scale)
  pred_CDS <- h2o.predict(yb_ml, test_CDS)
##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%
  pred_CDS_dataFIN <- as_tibble(test_CDS) %>%
    bind_cols(PRED = correction + as.vector(pred_CDS))
  
  correlation <- lm(pred_CDS_dataFIN$PRED ~ pred_CDS_dataFIN$sRNAdata_average_WT) %>% 
    summary() %>% .$r.squared
  CCC_value <- CCC(pred_CDS_dataFIN$sRNAdata_average_WT, pred_CDS_dataFIN$PRED)
  PEAR <- cor.test(pred_CDS_dataFIN$sRNAdata_average_WT, pred_CDS_dataFIN$PRED, method = "pearson")$estimate
  SPEAR <- SpearmanRho(pred_CDS_dataFIN$sRNAdata_average_WT, pred_CDS_dataFIN$PRED)

  
  
  #calculate lm for the offset calculation
  #here I switch to Real ~ Pred 
  fit <- lm( sRNAdata_average_WT ~PRED , data = pred_CDS_dataFIN)
  intercept <- fit$coefficients[1]
  slope <- fit$coefficients[2]  
  
  #calculate data to plot model in ggplot
  newdata <- data.frame(PRED = seq(min(pred_CDS_dataFIN$PRED), max(pred_CDS_dataFIN$PRED), length.out = 100))
  newdata$x_pred <- predict(fit, newdata)
  
  p_log <- pred_CDS_dataFIN %>%
    ggplot(aes(x = sRNAdata_average_WT, y = PRED)) +
    geom_point_rast(size = 0.5, alpha = 0.3) +
    geom_density_2d(color = "red", alpha = 0.5) +
    # geom_line(data=newdata, aes(y = PRED, x = x_pred,label=PRED), color = "red", linetype = "dotted", linewidth=2 ) +
    geom_smooth(method = "loess", method.args = list(family = "symmetric"), 
            se = TRUE, color = "blue")+
    geom_abline(intercept = 0, slope = 1) +
    scale_x_continuous(limits = c(-2, 3)) +
    scale_y_continuous(limits = c(-2, 3)) +
    annotate("text", x = -2, y = 1,
             label = paste("R2: ", round(correlation, 2), "\n", 
                          "CCC=", round(unlist(CCC_value[1])[1], 2), "\n",
                          "PEAR=", round(PEAR, 2), "\n",
                          "SPEAR=", round(SPEAR, 2), "\n",
                          paste("n=", nrow(pred_CDS_dataFIN), sep = ""), sep = ""),
             hjust = 0) +
    ggtitle(paste(scenarios[[scenario_name]]$name, "(log scale)")) +
    theme_bw() +
    scale_color_viridis_c() +
    theme(
      panel.grid.major = element_blank(),
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 12),
      panel.grid.minor = element_blank(),
      aspect.ratio = 1
    )
  
  print(p_log)
## `geom_smooth()` using formula = 'y ~ x'

    ggsave(p_log, filename = paste0("Prediction_vs_real_log.CDS.corrected.pdf"), width = 6, height = 6, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'

############################################################################################### ############################################################################################### ############################################################################################### # ########################### plot data for ramp-up ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # model on siYB tiles

load data

load data and perform basic reformatting and filtering

 # load data
RAW = read_tsv("biogEFF_tiles.100_-600_750.txt", col_names = TRUE)%>%
  #remove lost YB library
  select(-contains("243686"))%>%
  mutate(
    TYPE = case_when(
      grepl("CDS", CHR) ~ "CDS",
      grepl("UTR", CHR) ~ "UTR",
      grepl("CLUSTER", CHR) ~ "CLUSTER",
      TRUE ~ "OTHER"
    ),
  )
## Rows: 73254 Columns: 113
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (3): CHR, FBgn, STRAND
## dbl (110): START, STOP, X, thickSTART, thickSTOP, nPOS, RNAseq_RPKM, TTseq_R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  {}
## NULL
# filter data
TABLEfilt= RAW %>%
  filter(TYPE == "UTR"  )%>%
  dplyr::select(-contains("ARMI"))%>%
  #filters implemented for the study
  filter(UNIQ > 50, RNAseq_RPKM>5,TTseq_RPKM>5, nPOS>0, UTR_LENGTH>700 ) %>%
  {}

nLINES=nrow(TABLEfilt)


#calculate RNAseq/TTseq corrected smallRNA and CLIP counts
TABLEfilt <- TABLEfilt %>%
  mutate(
    across(
      starts_with(c("sRNAdata_")),
      ~.x / TTseq_RPKM ,
      .names = "TTnorm_{.col}"
    ),
    across(
      starts_with(c("sRNAdata_")),
      ~.x / RNAseq_RPKM,
      .names = "RNAnorm_{.col}"
    ),
    across(
      starts_with(c("CLIP_")),
      ~.x / RNAseq_RPKM,
      .names = "RNAnorm_{.col}"
    )
  )
plotTABLE = TABLEfilt %>% 
    select(FBgn, nPOS, TTseq_RPKM, RNAseq_RPKM,  contains("sRNAdata_average-WT")) %>%
    separate(FBgn, c("ID"), sep = ":!:", remove = FALSE, convert = TRUE) %>%
    filter(nPOS<=40)%>%
    group_by(ID) %>%
    mutate(
      scaledLEVEL = `sRNAdata_average-WT` / max(`sRNAdata_average-WT`, na.rm = TRUE )
    )

# Create a summary table with counts
count_table <- plotTABLE %>%
  group_by(nPOS) %>%
  summarise(n = n(), .groups = 'drop')

p = plotTABLE %>% 
  ggplot(aes(x=as.factor(nPOS), y=`TTnorm_sRNAdata_average-WT`))+
  geom_boxplot(aes(x=as.factor(nPOS)), outlier.size = 0.1 , linewidth=0.2)+
  geom_smooth(aes(x=nPOS) )+
  scale_y_log10()+
  # scale_y_continuous(limits = c(-0.1, 1)) +
  geom_text(data = count_table, 
            aes(x = as.factor(nPOS), y = 0.01, label = paste("n =", n)), 
            size = 3, angle = 90, hjust = 1) +
  theme_cowplot(14) +
  theme(
    legend.position = "none"
  )

print(p)

ggsave("WT_100nt-tiles_ramp-up.biogEFF.pdf", p, width = 6, height = 4, dpi = 300)